#!/usr/bin/env python

import argparse
import os
import time
import datetime
import json
import pandas as pd
import collections
import gzip
import cPickle as pickle
import itertools
import re


def _parse_bool(text):
    """Convert a command-line true/false word into a bool.

    Accepts 1/true/t/yes/y and 0/false/f/no/n (case-insensitive); anything
    else raises argparse.ArgumentTypeError so argparse reports a usage error.
    """
    value = text.strip().lower()
    if value in ('1', 'true', 't', 'yes', 'y'):
        return True
    if value in ('0', 'false', 'f', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean (true/false), got %r" % text)


# Command-line interface of the script.  Every option has a default, so the
# script can be started without any arguments.
parser = argparse.ArgumentParser(
    description='Select variables from text file, line-by-line')
parser.add_argument(
    '--info-file',
    metavar='INFO',
    default='/home/whobbs/workspace/cavr_wkdir/CalVoter_scripts/'
            'california_files_json.txt',
    help='information about variables, file location, and input separator'
    )
parser.add_argument(
    '--out-directory',
    metavar='DIR',
    default='~/workspace/cavr_wkdir/CalVoter_files/retrynames/',
    help='local directory to write outfile'
    )
parser.add_argument(
    '--chunksize',
    metavar='CHUNK',
    type=int,
    default=100000,
    help='rows per iteration'
    )
# Without a type, any value given on the command line (even "False") would be
# kept as a non-empty, and therefore truthy, string.  The non-string default
# is not passed through the type converter and stays True.
parser.add_argument(
    '--first-run',
    metavar='FIRST',
    type=_parse_bool,
    default=True,
    help='whether this is the first run (true/false)'
    )
# Parse the command line, then read the JSON document named by --info-file.
args = parser.parse_args()

# The `with` block closes the info file as soon as it has been parsed
# instead of leaving the handle open for the rest of the run.
with open(os.path.expanduser(args.info_file)) as json_data:
    file_info = json.load(json_data)

the_chunksize = args.chunksize

#### load the pickled collection of ineligible person IDs (PIDs)
# The file name embeds the first four characters of one key of the info file.
# NOTE(review): which key comes first depends on dict ordering -- presumably
# all keys share the same four-character prefix; confirm.
# os.path.join keeps the path valid whether or not --out-directory ends with
# a path separator.
# NOTE(review): unpickling executes code stored in the file; only load files
# from a trusted output directory.
ineligible_pids_path = os.path.expanduser(
    os.path.join(
        args.out_directory,
        "all_ineligible_pids_" + list(file_info)[0][0:4] + ".pgz"
        )
    )
with gzip.open(ineligible_pids_path, 'rb') as f:
    all_ineligible_pids = pickle.load(f)
    
#### pull out the still eligible HIDs and write to file, by file-year

# Mean length of a Gregorian year in days; converts date-of-birth gaps
# from days to years.
DAYS_PER_YEAR = 365.2425
# Two household members count as the same generation when their dates of
# birth are less than this many years apart.
SAME_GENERATION_YEARS = 15


def _timestamp():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.datetime.fromtimestamp(
        time.time()
        ).strftime('%Y-%m-%d %H:%M:%S')


def _out_path(prefix, the_year, extension):
    """Return the user-expanded path of a per-year file in --out-directory.

    os.path.join is used so the directory may be given with or without a
    trailing path separator.
    """
    return os.path.expanduser(
        os.path.join(args.out_directory, prefix + the_year + extension))


def _select_eligible_households(hid_to_pid, ineligible_pids):
    """Return {hid: members} for households with no ineligible member.

    hid_to_pid maps a household ID to a dict keyed by person ID;
    ineligible_pids is a set of person IDs.  Each household is tested
    directly, so a person ID listed under more than one household makes
    every one of those households ineligible.
    """
    return {
        hid: hid_to_pid[hid]
        for hid in hid_to_pid
        if ineligible_pids.isdisjoint(hid_to_pid[hid])
    }


def _build_household_associations(members, the_year):
    """Return a DataFrame with one row per ordered (ego, alter) pair.

    members maps person ID -> date of birth for one household.  Columns:
    eID, aID (person IDs) and, suffixed with '_' + the_year: eDOB, aDOB,
    OCC (household size), DOBdiff (ego DOB minus alter DOB, in years) and
    eGENsame / aGENsame (how many other household members are within
    SAME_GENERATION_YEARS of the ego / of the alter).
    """
    rows = [
        (ego_id, ego_dob, alter_id, alter_dob)
        for (ego_id, ego_dob), (alter_id, alter_dob)
        in itertools.permutations(members.items(), 2)
    ]
    pairs = pd.DataFrame(rows, columns=['eID', 'eDOB', 'aID', 'aDOB'])
    pairs['OCC'] = len(members)

    # Express the gap in years explicitly so that the threshold below is
    # compared against years rather than against a raw timedelta.  Unparsable
    # or missing dates give NaN, which never counts as "same generation".
    dob_gap = pd.to_datetime(pairs['eDOB']) - pd.to_datetime(pairs['aDOB'])
    pairs['DOBdiff'] = dob_gap.dt.days / DAYS_PER_YEAR

    same_generation = pairs[pairs['DOBdiff'].abs() < SAME_GENERATION_YEARS]
    # A Counter yields 0 for people without any same-generation partner.
    same_generation_count = collections.Counter(same_generation['eID'])
    pairs['eGENsame'] = [same_generation_count[pid] for pid in pairs['eID']]
    pairs['aGENsame'] = [same_generation_count[pid] for pid in pairs['aID']]

    # Only the ID columns stay unsuffixed.
    pairs.columns = [
        name if name in ('eID', 'aID') else name + '_' + the_year
        for name in pairs.columns
    ]
    return pairs


# Build the lookup set once; it is reused unchanged for every file-year.
if isinstance(all_ineligible_pids, (set, frozenset)):
    ineligible_pid_set = all_ineligible_pids
else:
    ineligible_pid_set = set(all_ineligible_pids)

print("Filtering households and creating ego-alter associations:")
for the_year in sorted(file_info):
    start = time.time()
    print("  " + the_year + ".. " + " current time: " + _timestamp())
    # NOTE(review): unpickling executes code stored in the file; only load
    # files from a trusted output directory.
    with gzip.open(_out_path("hid_to_pid_", the_year, ".pgz"), 'rb') as f:
        hid_to_pid = pickle.load(f)

    print("    collecting all eligible households..")
    eligible_hids = _select_eligible_households(hid_to_pid, ineligible_pid_set)
    # Release the full mapping before the association frames are built.
    del hid_to_pid

    print("    creating ego-alter associations..")
    associations_path = _out_path(
        "pid_household_associations_", the_year, ".csv")
    # The first frame actually written truncates the file and carries the
    # header; tracking writes (not households seen) keeps this correct when
    # the first households have a single member and are skipped.
    first_write = True
    for hid in eligible_hids:
        members = eligible_hids[hid]
        if len(members) < 2:
            # A single-person household has no ego-alter pairs.
            continue
        _build_household_associations(members, the_year).to_csv(
            associations_path,
            header=first_write,
            mode='w' if first_write else 'a',
            index=False
            )
        first_write = False

    del eligible_hids

    print("  Done." + " (time spent on file-year: " +
          str(datetime.timedelta(seconds=time.time() - start)) + ".)\n")

# Final status line with the wall-clock time at which the script finished.
finished_at = datetime.datetime.fromtimestamp(time.time())
print("Done (EOF). (current time: " +
      finished_at.strftime('%Y-%m-%d %H:%M:%S') + ".)\n")
